First, we load the R packages needed for the analysis:
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(factoextra)
library(scorecard)
library(glmnet)
library(ggplot2)
library(plotly)
library(dplyr)
library(xefun)
library(modeest)
library(cluster)
library(GA)
library(dendextend)
library(parallel)
library(ROCR)
library(gridExtra)
library(grid)
library(writexl)
library(openxlsx)
library(clusterSim)
library(LLM)
library(ROCR)
library(verification)
library(plotROC)
library(pROC)
library(xgboost)
library(Matrix)
library(dbscan)
library(caret)
library(e1071)
library(rpart.plot)
library(factoextra)
library(reshape2)
library(knitr)
# Load the raw modelling table, plus a second copy of the same CSV from which
# a handful of columns are taken.
# NOTE(review): both reads appear to target the same file name; the second
# path's non-ASCII folder name looks mojibake-encoded ("Ù…ØÙ…دی") — confirm
# the intended path and whether two separate reads are really required.
train_data <- read.csv("C:/Users/81208/Documents/df20k_selected_ids_hash.csv")
train_data2 <- read.csv("C:/Users/81208/Documents/دیتا پایان نامه Ù…ØÙ…دی/df20k_selected_ids_hash.csv")
# Overwrite selected columns of the working frame with the second read's
# versions (presumably the corrected/authoritative values — TODO confirm).
train_data $ Age <- train_data2$Age
train_data $ DSinceADueActH1 <- train_data2$DSinceADueActH1
train_data $ NMonthWithNoDelinq1ContrL36M <- train_data2$NMonthWithNoDelinq1ContrL36M
train_data $ NContrWithAPastDueL36M <- train_data2$NContrWithAPastDueL36M
train_data $ aaReltiveAvgOutL6xL12M <- train_data2$aaReltiveAvgOutL6xL12M
train_data $ Default <- train_data2$Default
######change str
# Tabulate the employment-type categories before encoding them numerically.
type_counts <- table(train_data$type)
print(type_counts)
# Encode `type` as a numeric code following the declared level order.
# BUG FIX: `levels` was previously passed to as.numeric() — where it is
# silently ignored — instead of factor(), so the codes were assigned in
# alphabetical order rather than the intended order below.
train_data <- train_data %>%
  mutate(
    type = as.numeric(factor(type, levels = c("نامعلوم", "شاغل", "بازنشسته"))))
######handle missing
# Normalise the various "missing" encodings (blank strings and three sentinel
# labels) to NA across every column.
# NOTE(review): trimws()/ifelse() on numeric columns coerces them through
# character; the global numeric coercion at the bottom restores types —
# confirm no precision/format loss on the way through.
train_data <- train_data %>%
mutate_all(~ ifelse(trimws(.) == "" | . %in% c("missing", "missing_snapshot" , "missing_contract"), NA, .))
# Mean-impute DSinceADueActH1, rounded to one decimal.
C <- round(mean(train_data$DSinceADueActH1 , na.rm= TRUE), 1)
train_data$DSinceADueActH1[is.na(train_data$DSinceADueActH1)] <- C
# Use the hashed customer id as row names, then drop the id and birth-date
# columns from the predictor set.
row.names(train_data) <- train_data$CustomerId_hash
train_data <- dplyr::select(train_data , -CustomerId_hash , -RegBirthDate)
# Drop rows missing the key amount column, and rows with Gender coded 0.
train_data <- train_data[!is.na(train_data$A_Max_TotalAmount_L36M_Main) ,]
train_data <- train_data[train_data$Gender != 0 , ]
# Coerce every remaining column to numeric (non-numeric strings become NA,
# with a warning).
train_data[] <- lapply(train_data , function(x)
as.numeric(as.character(x)))
#####na
# Count remaining NAs per column (after the coercions above).
na_counts <- sapply(train_data, function(x) sum(is.na(x)))
print(na_counts)
str(train_data)
# Check for missing values and handle them — remaining NAs are zero-filled.
# NOTE(review): zero-imputation is a strong assumption for amount/count
# columns; confirm 0 is a meaningful "absent" value for all of them.
train_data[is.na(train_data)] <- 0
# Re-coerce any column that is fully numeric-parseable; after the zero-fill
# above this is effectively a no-op safeguard.
train_data[] <- lapply(train_data, function(x) {
if (all(!is.na(as.numeric(as.character(x))))) {
return(as.numeric(as.character(x)))
} else {
return(x)
}
})
# Check for missing values and duplicates
sum(is.na(train_data))
sum(duplicated(train_data))
# Cap the single largest value of each bounced-cheque amount column at
# 1.5e+9 so one extreme observation does not dominate scaling and the IQR
# outlier step further down.
a1 <- max(train_data$SumBouncedAmount_HalfOrEmpty_InL36M)
train_data["SumBouncedAmount_HalfOrEmpty_InL36M"][train_data["SumBouncedAmount_HalfOrEmpty_InL36M"] == a1] <- 1.5e+9
a2 <- max(train_data$MaxBouncedAmount_HalfOrEmpty_InL36M)
train_data["MaxBouncedAmount_HalfOrEmpty_InL36M"][train_data["MaxBouncedAmount_HalfOrEmpty_InL36M"] == a2] <- 1.5e+9
a3 <- max(train_data$MinBouncedAmount_HalfOrEmpty_InL36M)
train_data["MinBouncedAmount_HalfOrEmpty_InL36M"][train_data["MinBouncedAmount_HalfOrEmpty_InL36M"] == a3] <- 1.5e+9
a4 <- max(train_data$AvgBouncedAmount_HalfOrEmpty_InL36M)
train_data["AvgBouncedAmount_HalfOrEmpty_InL36M"][train_data["AvgBouncedAmount_HalfOrEmpty_InL36M"] == a4] <- 1.5e+9
# BUG FIX: the three statements below computed the max of the
# A_*_TotalAmount_NegStat_L36M_Act_Main columns but then wrote the 1.5e+9 cap
# into AvgBouncedAmount_HalfOrEmpty_InL36M (copy-paste error). Each now caps
# the same column its max was taken from.
b1 <- max(train_data$A_Avg_TotalAmount_NegStat_L36M_Act_Main)
train_data["A_Avg_TotalAmount_NegStat_L36M_Act_Main"][train_data["A_Avg_TotalAmount_NegStat_L36M_Act_Main"] == b1] <- 1.5e+9
b2 <- max(train_data$A_Max_TotalAmount_NegStat_L36M_Act_Main)
train_data["A_Max_TotalAmount_NegStat_L36M_Act_Main"][train_data["A_Max_TotalAmount_NegStat_L36M_Act_Main"] == b2] <- 1.5e+9
b3 <- max(train_data$A_Min_TotalAmount_NegStat_L36M_Act_Main)
train_data["A_Min_TotalAmount_NegStat_L36M_Act_Main"][train_data["A_Min_TotalAmount_NegStat_L36M_Act_Main"] == b3] <- 1.5e+9
######outliers
# Columns whose IQR outliers are winsorised to Q1/Q3.
# FIX: removed the duplicated entries ("A_Max_TotalAmount_L36M_Main" and
# "A_Max_TotalAmount_NegStat_L36M_Main" were each listed twice; re-processing
# a column is idempotent but wasteful and misleading).
columns_to_replace <- c(
  "A_Max_TotalAmount_L36M_Main",
  "A_Avg_TotalAmount_L36M_Main",
  "A_Min_TotalAmount_L36M_Main",
  "A_Max_TotalAmount_NegStat_L36M_Main",
  "A_Avg_TotalAmount_NegStat_L36M_Main",
  "A_Min_TotalAmount_NegStat_L36M_Main",
  "A_Max_TotalAmount_L36M_Act_Main",
  "A_Min_TotalAmount_L36M_Act_Main",
  "A_Avg_TotalAmount_L36M_Act_Main"
)
# Winsorise IQR outliers in the selected columns of a data.frame.
# Values below Q1 - 1.5*IQR are replaced with Q1; values above Q3 + 1.5*IQR
# with Q3. Bounds are always computed from the ORIGINAL data, so repeated
# application is idempotent.
#
# train_data: data.frame to clean (not modified in place)
# columns:    character vector of column names to process
# returns:    a copy of train_data with outliers replaced
#
# FIX: removed the unused `outlier_condition` variable; quantiles now use
# na.rm = TRUE so a stray NA no longer aborts the whole loop.
replace_outliers_specific_columns <- function(train_data, columns) {
  df_replaced <- train_data
  for (col in columns) {
    Q1 <- quantile(train_data[[col]], 0.25, na.rm = TRUE)
    Q3 <- quantile(train_data[[col]], 0.75, na.rm = TRUE)
    IQR_value <- Q3 - Q1
    lower_bound <- Q1 - 1.5 * IQR_value
    upper_bound <- Q3 + 1.5 * IQR_value
    df_replaced[[col]][train_data[[col]] < lower_bound] <- Q1
    df_replaced[[col]][train_data[[col]] > upper_bound] <- Q3
  }
  df_replaced
}
# Winsorise the monetary columns, then drop the few rows whose bounced-cheque
# totals still exceed fixed hard caps.
df_replaced <- replace_outliers_specific_columns(train_data, columns_to_replace)
# FIX: removed two dead max() computations whose results were immediately
# overwritten by the constants below.
v3 <- 2e+10   # hard cap on SumBouncedAmount_HalfOrEmpty_InL36M
df_replaced <- df_replaced[df_replaced$SumBouncedAmount_HalfOrEmpty_InL36M <= v3 , ]
v4 <- 1e+10   # hard cap on MaxBouncedAmount_HalfOrEmpty_InL36M
df_replaced <- df_replaced[df_replaced$MaxBouncedAmount_HalfOrEmpty_InL36M <= v4 , ]
# Visual check: per-column boxplots after outlier treatment.
df_long <- melt(df_replaced)
ggplot(df_long, aes(x = variable, y = value)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Boxplots for All Columns",
x = "Columns",
y = "Values") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#########Scale
# Age is brought to a roughly [0, 1] range by hand; the remaining predictors
# are min-max scaled with caret::preProcess below.
df_replaced$Age <- df_replaced$Age /100
# Fit the range scaler on everything except Default and Age.
df1 <- dplyr::select(df_replaced, -Default , -Age)
scaled_data <- preProcess(df1, method = "range")
# predict() applies the fitted ranges; columns not seen during fitting
# (Default, Age) pass through unchanged.
train_data_scaled <- predict(scaled_data, df_replaced)
# Visual check of the scaled columns.
df_long1 <- melt(train_data_scaled)
ggplot(df_long1, aes(x = variable, y = value)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Boxplots for All Columns",
x = "Columns",
y = "Values") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#preProcValues <- preProcess(df1, method = c("center", "scale"))
#df_transformed <- predict(preProcValues, df1)
# Modelling frame: scaled predictors + original Gender/Default.
df2 <- train_data_scaled
df2 $ Gender <- df_replaced$Gender
df2 $ Default <- df_replaced$Default
row.names(df2) <- NULL
# Information value of each predictor with respect to Default.
# FIX: result renamed from `iv` to `iv_table` so it no longer shadows
# scorecard::iv(); assignment switched from `=` to `<-`.
iv_table <- iv(df2, y = 'Default') %>%
  as_tibble() %>%
  mutate( info_value = round(info_value, 3) ) %>%
  arrange( desc(info_value) )
iv_table %>%
  knitr::kable()
## part4_feature_selection
### PCA
set.seed(1)
# Predictors only (Default removed) for the unsupervised steps.
df3 <- dplyr::select(df2, -Default)
# Perform PCA on the already min-max-scaled data (no re-scaling).
data_pca <- prcomp(df3, center = TRUE, scale. = F)
# BUG FIX: loadings were read from df3$rotation — a data.frame has no
# `rotation` element, so this was always NULL; take them from the prcomp fit.
loadings <- data_pca$rotation
fviz_eig(data_pca, addlabels = TRUE)
# Check the PCA output
eig.val<-get_eigenvalue(data_pca)
eig.val
###plot: per-variable contribution to the first five components
var<-get_pca_var(data_pca)
pc1<-fviz_contrib(data_pca, "var", axes=1, xtickslab.rt=90) # default angle=45°
plot(pc1,main = "Variables percentage contribution of first Principal Components")
pc2<-fviz_contrib(data_pca, "var", axes=2, xtickslab.rt=90) # default angle=45°
plot(pc2,main = "Variables percentage contribution of second Principal Components")
pc3<-fviz_contrib(data_pca, "var", axes=3, xtickslab.rt=90) # default angle=45°
plot(pc3,main = "Variables percentage contribution of third Principal Components")
pc4<-fviz_contrib(data_pca, "var", axes=4, xtickslab.rt=90) # default angle=45°
# FIX: title typo "fourt" -> "fourth"
plot(pc4,main = "Variables percentage contribution of fourth Principal Components")
pc5<-fviz_contrib(data_pca, "var", axes=5, xtickslab.rt=90) # default angle=45°
# FIX: title mislabelled the fifth component as "fourt"
plot(pc5,main = "Variables percentage contribution of fifth Principal Components")
######################################
### Rename columns to f1, f2, f3, ...
# Short synthetic names (f1..fn) keep the PCA variable-plot labels readable.
df1_new <- df3
new_col_names <- paste0("f", seq_along(colnames(df1_new)))
names(df1_new) <- new_col_names
# Create a mapping table of original and new column names
# BUG FIX: dplyr::select(df3) with no selection returns a zero-column frame,
# so the mapping vector was always empty; use colnames(df3) directly.
original_col_names <- colnames(df3)
data_pca1 <- prcomp(df1_new, center = TRUE, scale. = F)
loadings <- data_pca1$rotation
fviz_pca_var(data_pca1,
col.var = "cos2", # Color by the quality of representation
gradient.cols = c("darkorchid4", "gold", "darkorange"),
repel = TRUE
)
# Keep the first 6 principal components as the clustering space.
# NOTE(review): this prcomp uses center = FALSE, unlike data_pca above —
# confirm the inconsistency is intentional.
data_pca_final<-prcomp(df3, center=FALSE, scale.=FALSE, rank. = 6)
results <- data_pca_final$x
### K-means on all variable
#####cluster
set.seed(2)
# Total within-cluster sum of squares for a k-cluster fit on the full scaled
# predictor set df3; used for the elbow curve.
wss2 <- function(k) {
kmeans(df3, centers = k, iter.max = 100, nstart = 50)$tot.withinss
}
k.values <- 1:10
wss_values2 <- sapply(k.values, wss2)
# Elbow method
plot(k.values, wss_values2, type = 'b', xlab = 'Number of Clusters', ylab = 'Total Within-Cluster Sum of Squares', main = 'Elbow Method')
# Perform k-means clustering with 4 cluster
set.seed(3)
km4 <- kmeans(df3, centers = 4, nstart =100)
Customers_Segments_km4_all <- data.frame(df2, cluster = as.factor(km4$cluster))
# 3-D scatter of the clusters, plotted in the first three PCA components.
pkm4 <- plot_ly(Customers_Segments_km4_all, x=~data_pca_final$x[,1], y=~data_pca_final$x[,2], z=~data_pca_final$x[,3], color=~cluster) %>%
add_markers(size=1.5)
pkm4
# Same elbow computation, but on the 6-component PCA scores (`results`).
wss1 <- function(k) {
kmeans(results, centers = k, iter.max = 200, nstart = 100)$tot.withinss
}
wss_values1 <- sapply(k.values, wss1)
# Elbow method
plot(k.values, wss_values1, type = 'b', xlab = 'Number of Clusters', ylab = 'Total Within-Cluster Sum of Squares', main = 'Elbow Method')
# Perform k-means clustering with 4 cluster on the PCA scores
set.seed(4)
km_res4 <- kmeans(results, centers = 4, nstart =100)
Customers_Segments_km4 <- data.frame(results, cluster = as.factor(km_res4$cluster))
# 3-D view of the PCA-space clusters (components 1-3).
pkm_res4 <- plot_ly(Customers_Segments_km4, x=~Customers_Segments_km4[,1], y=~Customers_Segments_km4[,2],
z=~Customers_Segments_km4[,3], color=~cluster) %>%
add_markers(size=1.5)
pkm_res4
km_res4$size
km_res4$centers
# Attach the k-means label to the modelling frame and materialise one
# data.frame per cluster for later inspection.
df2$groupkm <-km_res4$cluster
g1<- df2[df2$groupkm==1,]
g2<- df2[df2$groupkm==2,]
g3<- df2[df2$groupkm==3,]
g4<- df2[df2$groupkm==4,]
# Step 1: Calculate means for each variable in each cluster
cluster_means <- aggregate(. ~ groupkm, data = df2, FUN = mean)
print(cluster_means)
# Step 2: Perform one-way ANOVA for each variable across clusters.
# BUG FIX: groupkm is numeric, so aov() previously fitted a linear trend in
# the cluster LABEL rather than comparing the four cluster means; wrap it in
# factor() to get a genuine between-groups ANOVA.
anova_results <- lapply(names(df2)[-ncol(df2)], function(var) {
  aov_formula <- as.formula(paste(var, "~ factor(groupkm)"))
  summary(aov(aov_formula, data = df2))
})
names(anova_results) <- names(df2)[-ncol(df2)]
# Boxplots of three key monetary variables, split by k-means cluster, to see
# how the segments differ.
#A_Avg_TotalAmount_NegStat_L36M_Main
df2 %>%
ggplot(aes(x = as.factor(groupkm), y = A_Avg_TotalAmount_NegStat_L36M_Main)) +
geom_boxplot() +
theme(
legend.position = "none",
plot.title = element_text(size = 11)
) +
ggtitle("Boxplot of A_Avg_TotalAmount_NegStat_L36M_Main by groupkm") +
xlab("GroupKM") +
ylab("A_Avg_TotalAmount_NegStat_L36M_Main")
#A_Max_TotalAmount_L36M_Main
df2 %>%
ggplot(aes(x = as.factor(groupkm), y = A_Max_TotalAmount_L36M_Main)) +
geom_boxplot() +
theme(
legend.position = "none",
plot.title = element_text(size = 11)
) +
ggtitle("Boxplot of A_Max_TotalAmount_L36M_Main by groupkm") +
xlab("GroupKM") +
ylab("A_Max_TotalAmount_L36M_Main")
#A_Avg_TotalAmount_L36M_Main
df2 %>%
ggplot(aes(x = as.factor(groupkm), y = A_Avg_TotalAmount_L36M_Main)) +
geom_boxplot() +
theme(
legend.position = "none",
plot.title = element_text(size = 11)
) +
ggtitle("Boxplot of A_Avg_TotalAmount_L36M_Main by groupkm") +
xlab("GroupKM") +
ylab("A_Avg_TotalAmount_L36M_Main")
Defaultdf <- df2[df2$Default == 1, ]
# Bar chart: number of defaulters (Default == 1) in each k-means group.
# FIX: `..count..` is deprecated in ggplot2 >= 3.4; use after_stat(count).
Defaultdf %>%
ggplot(aes(x = as.factor(groupkm))) +
geom_bar( stat = "count" , color= "black") +
geom_text(stat = "count" , aes(label = after_stat(count)) , vjust = -0.5 , size =4)+
labs(title = "default = 1 by kmgroups" , x = "Groups" , y= "count of default = 1")+
theme_minimal()
noDefaultdf <- df2[df2$Default == 0, ]
# Same chart for non-defaulters (Default == 0).
noDefaultdf %>%
ggplot(aes(x = as.factor(groupkm))) +
geom_bar( stat = "count" , color= "black") +
geom_text(stat = "count" , aes(label = after_stat(count)) , vjust = -0.5 , size =4)+
labs(title = "default = 0 by kmgroups" , x = "Groups" , y= "count of default = 0")+
theme_minimal()
# Per-cluster summary statistics (means of key variables, plus the default
# rate) for the k-means segmentation.
cluster_summary_kmeans <- df2 %>%
group_by(groupkm) %>%
summarise(
count = n(),
A_Max_TotalAmount_NegStat_L36M_Act_Main_mean = mean(A_Max_TotalAmount_NegStat_L36M_Act_Main),
A_Avg_TotalAmount_NegStat_L36M_Main_mean = mean(A_Avg_TotalAmount_NegStat_L36M_Main),
A_Max_TotalAmount_L36M_Main_mean = mean(A_Max_TotalAmount_L36M_Main),
A_Avg_TotalAmount_L36M_Main_mean = mean(A_Avg_TotalAmount_L36M_Main),
NumOf_Main_contract_NegStat_L12M_mean = mean(NumOf_Main_contract_NegStat_L12M),
# NOTE(review): suffix says _min but the statistic is a mean — confirm.
NumOf_Main_contract_NoNegStat_L12M_min = mean(NumOf_Main_contract_NoNegStat_L12M),
SumBouncedAmount_HalfOrEmpty_InL36M_mean = mean(SumBouncedAmount_HalfOrEmpty_InL36M),
NumCheques_HalfOrEmpty_InL36M_mean = mean(NumCheques_HalfOrEmpty_InL36M),
A_Avg_TotalAmount_L36M_Main_mean = mean(A_Avg_TotalAmount_L36M_Main),
Age_mean = mean(Age),
percentile_mean = mean(percentile),
Gender_mean = mean(Gender),
default = mean(Default))
# Display the summary, transposed so clusters become columns.
cluster_summary_kmeans <- t(cluster_summary_kmeans)
print(cluster_summary_kmeans)
cluster_summary_kmeans <- as.data.frame(cluster_summary_kmeans)
set.seed(5)
# GA fitness: maximise -WSS, i.e. minimise the within-cluster sum of squares
# for k = round(params), clamped to at least 2.
# NOTE(review): tot.withinss decreases monotonically as k grows, so this
# objective drives the GA towards the upper bound (k = 10) rather than an
# "optimal" k — confirm this is the intended behaviour.
objective_function <- function(params) {
k <- round(params)
if (k < 2) k <- 2
return(-calculate_wss(k, results))
}
# Total within-cluster sum of squares of a k-means fit with k centers.
calculate_wss <- function(k, results) {
kmeans_result <- kmeans(results, centers = k, nstart = 100)
return(kmeans_result$tot.withinss)
}
# Real-valued GA over k in [1, 10]; stops after 15 generations without
# improvement; fitness evaluations run in parallel.
ga_result <- ga(
type = "real-valued",
fitness = objective_function,
lower = 1,
upper = 10,
run = 15,
parallel = TRUE,
monitor = if(interactive()) gaMonitor else FALSE
)
# Re-fit k-means with the GA-selected number of clusters.
best_k <- round(ga_result@solution[1])
GA_kmeans <- kmeans(results, centers = best_k, nstart = 100)
Customers_Segments_ga <- data.frame(results, cluster = as.factor(GA_kmeans$cluster))
# NOTE(review): this plot uses PCA components 2-4 (columns 2:4), unlike the
# earlier cluster plots which used components 1-3 — confirm which is intended.
pga <- plot_ly(Customers_Segments_ga, x=~Customers_Segments_ga[,2], y=~Customers_Segments_ga[,3],
z=~Customers_Segments_ga[,4], color=~cluster) %>%
add_markers(size=1.5)
pga
GA_kmeans$size
GA_kmeans$centers
######################################
## Realization of clusters
# Attach the GA-chosen cluster label to the modelling frame.
df2$group_GA <- GA_kmeans$cluster
# BUG FIX: these per-cluster subsets were taken from df1, which has no
# group_GA column — `df1$group_GA == k` evaluated to logical(0), so every
# subset was silently empty. Subset df2 (which carries group_GA) instead.
g1_GA<- df2[df2$group_GA==1,]
g2_GA<- df2[df2$group_GA==2,]
g3_GA<- df2[df2$group_GA==3,]
g4_GA<- df2[df2$group_GA==4,]
g5_GA<- df2[df2$group_GA==5,]
g6_GA<- df2[df2$group_GA==6,]
g7_GA<- df2[df2$group_GA==7,]
g8_GA<- df2[df2$group_GA==8,]
g9_GA<- df2[df2$group_GA==9,]
g10_GA<- df2[df2$group_GA==10,]
# Per-cluster summary statistics for the GA segmentation.
cluster_summary_GA <- df2 %>%
group_by(group_GA) %>%
summarise(
count = n(),
A_Max_TotalAmount_NegStat_L36M_Act_Main_mean = mean(A_Max_TotalAmount_NegStat_L36M_Act_Main),
A_Avg_TotalAmount_NegStat_L36M_Main_mean = mean(A_Avg_TotalAmount_NegStat_L36M_Main),
A_Max_TotalAmount_L36M_Main_mean = mean(A_Max_TotalAmount_L36M_Main),
A_Avg_TotalAmount_L36M_Main_mean = mean(A_Avg_TotalAmount_L36M_Main),
NumOf_Main_contract_NegStat_L12M_mean = mean(NumOf_Main_contract_NegStat_L12M),
NumOf_Main_contract_NoNegStat_L12M_min = mean(NumOf_Main_contract_NoNegStat_L12M),
SumBouncedAmount_HalfOrEmpty_InL36M_mean = mean(SumBouncedAmount_HalfOrEmpty_InL36M),
NumCheques_HalfOrEmpty_InL36M_mean = mean(NumCheques_HalfOrEmpty_InL36M),
A_Avg_TotalAmount_L36M_Main_mean = mean(A_Avg_TotalAmount_L36M_Main),
Age_mean = mean(Age),
percentile_mean = mean(percentile),
Gender_mean = mean(Gender),
groupkm_mean = mean(groupkm),
default = mean(Default))
# Cross-tabulation of GA clusters against the earlier k-means clusters.
group <- df2 %>%
group_by(group_GA , groupkm)%>%
summarise(count = n() , .groups = 'drop')
kable(group)
# Display the summary, transposed so clusters become columns.
cluster_summary_GA <- t(cluster_summary_GA)
print(cluster_summary_GA)
cluster_summary_GA <- as.data.frame(cluster_summary_GA)
# agglomerative hierarchical clustering
# Dissimilarity matrix on a random sample of 100 PCA-score rows
# (hclust on the full data set would be prohibitively expensive).
df4 <- data.frame(results)
set.seed(9)
df5 <- df4 %>%
sample_n(100)
d <- dist(df5, method = "euclidean")
# Hierarchical clustering using Ward and Complete Linkage
hc1 <- hclust(d, method = "ward.D" )
hc2 <- hclust(d, method = "complete" )
# Choose the cluster count on the sample: elbow (wss) and gap statistic.
fviz_nbclust(df5, FUN = hcut, method = "wss")
gap_stat <- clusGap(df5, FUN = hcut, nstart = 25, K.max = 10, B = 50)
fviz_gap_stat(gap_stat)
# Plot the obtained dendrograms, highlighting a 5-cluster cut.
plot(hc1, cex = 0.6, hang = -1)
rect.hclust(hc1, k = 5, border = 2:5)
plot(hc2, cex = 0.6, hang = -1)
rect.hclust(hc2, k = 5, border = 2:5)
# Cut tree into 5 groups (original comment said 3; the code uses k = 5)
sub_grp1 <- cutree(hc1, k = 5)
sub_grp2 <- cutree(hc2, k = 5)
# Number of members in each cluster
table(sub_grp1)
table(sub_grp2)
fviz_cluster(list(data = df5, cluster = sub_grp1))+
ggtitle("HCluster for Ward.D") +
theme_minimal()
fviz_cluster(list(data = df5, cluster = sub_grp2))+
ggtitle("HCluster for Complete") +
theme_minimal()
# Compare the two linkage methods via a tanglegram.
dend1 <- as.dendrogram (hc1)
dend2 <- as.dendrogram (hc2)
tanglegram(dend1, dend2)
dend_list <- dendlist(dend1, dend2)
tanglegram(dend1, dend2,
highlight_distinct_edges = FALSE,
common_subtrees_color_lines = FALSE,
common_subtrees_color_branches = TRUE,
main = paste("entanglement =", round(entanglement(dend_list), 2))
)
# divisive
# compute divisive hierarchical clustering
hc3 <- diana(df5)
# Divise coefficient; amount of clustering structure found
hc3$dc
# plot dendrogram
pltree(hc3, cex = 0.6, hang = -1, main = "Dendrogram of diana")
# Davies-Bouldin index (lower = better-separated clusters) for the plain
# k-means and the GA-selected clustering of the PCA scores.
davies_bouldin_kmeans <- index.DB(results, km_res4$cluster, centrotypes="centroids")
davies_bouldin_GA <- index.DB(results, GA_kmeans$cluster, centrotypes="centroids")
# Davies-Bouldin Index values
davies_bouldin_kmeans_score <- davies_bouldin_kmeans$DB
davies_bouldin_GA_score <- davies_bouldin_GA$DB
# Create a data frame to store the results
comparison_table_resultcluster <- data.frame(
Method = c("kmeans" , "GA"),
Davies_Bouldin_Index = c(davies_bouldin_kmeans_score, davies_bouldin_GA_score)
)
# Display the table
knitr::kable(comparison_table_resultcluster, caption = "Comparison of Clustering Methods")
# Logit leaf model: tree-based segmentation with a logistic model per leaf.
# Columns 1:32 are the predictors; Default (column 33) is the target.
df <- df2[,1:33]
df$Default <- as.factor(df$Default)
llm <- llm(df[,1:32], df$Default, threshold_pruning = 0.85, nbr_obs_leaf = 3000)
plot(llm$`Full decision tree for segmentation`)
#####xgboost on all
set.seed(9)
# BUG FIX: the 70/30 split index was sized on df2 but then used to subset
# train_data, a different (less-processed) frame with different columns.
# Train and test now come from the same modelling frame: the first 33
# columns of df2 (predictors with Default in column 33), matching the
# per-segment models below.
dat_all <- df2[, 1:33]
index <- sample(2, size = nrow(dat_all), replace = T , prob=c(0.7,0.3))
train <- dat_all[index == 1, ]
test <- dat_all[index == 2, ]
# Convert the label from factor to numeric
train$Default <- as.numeric(as.character(train$Default))
test$Default <- as.numeric(as.character(test$Default))
# Column 33 is Default, so [, -33] keeps only the predictors.
trainxg_matrix <- xgb.DMatrix(data = as.matrix(train[, -33]), label = train$Default)
testxg_matrix <- xgb.DMatrix(data = as.matrix(test[, -33]), label = test$Default)
# Set parameters for XGBoost
params <- list(objective = "binary:logistic", eval_metric = "auc")
# Train the model
xgb_model <- xgboost(params = params, data = trainxg_matrix, nrounds = 100, verbose = 0)
# Make predictions (probabilities of Default = 1)
train_pred_xgb <- predict(xgb_model, trainxg_matrix)
test_pred_xgb <- predict(xgb_model, testxg_matrix)
# Calculate AUC for train and test data
auc_train_xgb <- auc(train$Default, train_pred_xgb)
auc_test_xgb <- auc(test$Default, test_pred_xgb)
# Hard class assignment at the 0.5 probability cutoff
train_pred_class_xgb <- ifelse(train_pred_xgb > 0.5, 1, 0)
test_pred_class_xgb <- ifelse(test_pred_xgb > 0.5, 1, 0)
# Test-set confusion matrix and derived metrics
confusion_matrix_test_xgb <- table(Predicted = test_pred_class_xgb, Actual = test$Default)
TP_test_xgb <- confusion_matrix_test_xgb["1", "1"]
FP_test_xgb <- confusion_matrix_test_xgb["1", "0"]
FN_test_xgb <- confusion_matrix_test_xgb["0", "1"]
TN_test_xgb <- confusion_matrix_test_xgb["0", "0"]
confusion_matrix_test_xgb
Specificity_test_xgb <- TN_test_xgb / (TN_test_xgb + FP_test_xgb)
Sensitivity_test_xgb <- TP_test_xgb / (TP_test_xgb + FN_test_xgb)
Precision_test_xgb <- TP_test_xgb / (TP_test_xgb + FP_test_xgb)
Specificity_test_xgb
Sensitivity_test_xgb
Precision_test_xgb
# ROC and AUC
roc_test <- roc(test$Default, test_pred_xgb)
# Plot ROC Curve
ggroc(roc_test) +
ggtitle("ROC Curve for Test Data") +
xlab("1 - Specificity") +
ylab("Sensitivity")
# AUC Calculation
auc_test <- auc(roc_test)
auc_test
hist(test_pred_xgb)
# ---- Segment 1: XGBoost within the first k-means cluster ----
segment1<- df2[df2$groupkm==1,]
segment1 <- segment1[,1:33]
set.seed(1111)
# 70/30 train-test split inside the segment.
index1 <- sample(2,size = nrow(segment1), replace = T , prob=c(0.7,0.3))
train1 <- segment1[index1 == 1, ]
test1 <- segment1[index1 == 2, ]
# Convert the label from factor to numeric
train1$Default <- as.numeric(as.character(train1$Default))
test1$Default <- as.numeric(as.character(test1$Default))
# Column 33 is Default, so [, -33] keeps only the predictors.
trainxg_matrix1 <- xgb.DMatrix(data = as.matrix(train1[, -33]), label = train1$Default)
testxg_matrix1 <- xgb.DMatrix(data = as.matrix(test1[, -33]), label = test1$Default)
# Set parameters for XGBoost
params <- list(objective = "binary:logistic", eval_metric = "auc")
# Train the model
xgb_model1 <- xgboost(params = params, data = trainxg_matrix1, nrounds = 100, verbose = 0)
# Make predictions (probabilities of Default = 1)
train_pred_xgb1 <- predict(xgb_model1, trainxg_matrix1)
test_pred_xgb1 <- predict(xgb_model1, testxg_matrix1)
# Calculate AUC for train and test data
auc_train_xgb1 <- auc(train1$Default, train_pred_xgb1)
auc_test_xgb1 <- auc(test1$Default, test_pred_xgb1)
auc_test_xgb1
# Hard class assignment at the 0.5 probability cutoff
train_pred_class_xgb1 <- ifelse(train_pred_xgb1 > 0.5, 1, 0)
test_pred_class_xgb1 <- ifelse(test_pred_xgb1 > 0.5, 1, 0)
# Test-set confusion matrix and derived metrics
confusion_matrix_test_xgb1 <- table(Predicted = test_pred_class_xgb1, Actual = test1$Default)
TP_test_xgb1 <- confusion_matrix_test_xgb1["1", "1"]
FP_test_xgb1 <- confusion_matrix_test_xgb1["1", "0"]
FN_test_xgb1 <- confusion_matrix_test_xgb1["0", "1"]
TN_test_xgb1 <- confusion_matrix_test_xgb1["0", "0"]
confusion_matrix_test_xgb1
Specificity_test_xgb1 <- TN_test_xgb1 / (TN_test_xgb1 + FP_test_xgb1)
Specificity_test_xgb1
Sensitivity_test_xgb1 <- TP_test_xgb1 / (TP_test_xgb1 + FN_test_xgb1)
Sensitivity_test_xgb1
Precision_test_xgb1 <- TP_test_xgb1 / (TP_test_xgb1 + FP_test_xgb1)
Precision_test_xgb1
# ROC curves for train and test
roc_train1 <- roc(train1$Default, train_pred_xgb1)
roc_test1 <- roc(test1$Default, test_pred_xgb1)
# Plot ROC Curve
ggroc(roc_test1) +
ggtitle("ROC Curve for Test Data") +
xlab("1 - Specificity") +
ylab("Sensitivity")
# AUC Calculation
auc_test1 <- auc(roc_test1)
auc_test1
hist(test_pred_xgb1)
hist(train_pred_xgb1)
# ---- Segment 2: XGBoost within the second k-means cluster ----
segment2<- df2[df2$groupkm==2,]
segment2 <- segment2[,1:33]
set.seed(2222)
# 70/30 train-test split inside the segment.
index2 <- sample(2,size = nrow(segment2), replace = T , prob=c(0.7,0.3))
train2 <- segment2[index2 == 1, ]
test2 <- segment2[index2 == 2, ]
# Convert the label from factor to numeric
train2$Default <- as.numeric(as.character(train2$Default))
test2$Default <- as.numeric(as.character(test2$Default))
# Column 33 is Default, so [, -33] keeps only the predictors.
trainxg_matrix2 <- xgb.DMatrix(data = as.matrix(train2[, -33]), label = train2$Default)
testxg_matrix2 <- xgb.DMatrix(data = as.matrix(test2[, -33]), label = test2$Default)
# Set parameters for XGBoost
params <- list(objective = "binary:logistic", eval_metric = "auc")
# Train the model
xgb_model2 <- xgboost(params = params, data = trainxg_matrix2, nrounds = 100, verbose = 0)
# Make predictions (probabilities of Default = 1)
train_pred_xgb2 <- predict(xgb_model2, trainxg_matrix2)
test_pred_xgb2 <- predict(xgb_model2, testxg_matrix2)
# Calculate AUC for train and test data
auc_train_xgb2 <- auc(train2$Default, train_pred_xgb2)
auc_test_xgb2 <- auc(test2$Default, test_pred_xgb2)
auc_test_xgb2
# Hard class assignment at the 0.5 probability cutoff
train_pred_class_xgb2 <- ifelse(train_pred_xgb2 > 0.5, 1, 0)
test_pred_class_xgb2 <- ifelse(test_pred_xgb2 > 0.5, 1, 0)
# Test-set confusion matrix and derived metrics
confusion_matrix_test_xgb2 <- table(Predicted = test_pred_class_xgb2, Actual = test2$Default)
TP_test_xgb2 <- confusion_matrix_test_xgb2["1", "1"]
FP_test_xgb2 <- confusion_matrix_test_xgb2["1", "0"]
FN_test_xgb2 <- confusion_matrix_test_xgb2["0", "1"]
TN_test_xgb2 <- confusion_matrix_test_xgb2["0", "0"]
confusion_matrix_test_xgb2
Specificity_test_xgb2 <- TN_test_xgb2 / (TN_test_xgb2 + FP_test_xgb2)
Specificity_test_xgb2
Sensitivity_test_xgb2 <- TP_test_xgb2 / (TP_test_xgb2 + FN_test_xgb2)
Sensitivity_test_xgb2
Precision_test_xgb2 <- TP_test_xgb2 / (TP_test_xgb2 + FP_test_xgb2)
Precision_test_xgb2
# ROC curves for train and test
roc_train2 <- roc(train2$Default, train_pred_xgb2)
roc_test2 <- roc(test2$Default, test_pred_xgb2)
# Plot ROC Curve
ggroc(roc_test2) +
ggtitle("ROC Curve for Test Data") +
xlab("1 - Specificity") +
ylab("Sensitivity")
# AUC Calculation
auc_test2 <- auc(roc_test2)
auc_test2
hist(test_pred_xgb2)
# ---- Segment 3: XGBoost within the third k-means cluster ----
segment3<- df2[df2$groupkm==3,]
segment3 <- segment3[,1:33]
set.seed(3333)
# 70/30 train-test split inside the segment.
index3 <- sample(2,size = nrow(segment3), replace = T , prob=c(0.7,0.3))
train3 <- segment3[index3 == 1, ]
test3 <- segment3[index3 == 2, ]
# Convert the label from factor to numeric
train3$Default <- as.numeric(as.character(train3$Default))
test3$Default <- as.numeric(as.character(test3$Default))
# Column 33 is Default, so [, -33] keeps only the predictors.
trainxg_matrix3 <- xgb.DMatrix(data = as.matrix(train3[, -33]), label = train3$Default)
testxg_matrix3 <- xgb.DMatrix(data = as.matrix(test3[, -33]), label = test3$Default)
# Set parameters for XGBoost
params <- list(objective = "binary:logistic", eval_metric = "auc")
# Train the model
xgb_model3 <- xgboost(params = params, data = trainxg_matrix3, nrounds = 100, verbose = 0)
# Make predictions (probabilities of Default = 1)
train_pred_xgb3 <- predict(xgb_model3, trainxg_matrix3)
test_pred_xgb3 <- predict(xgb_model3, testxg_matrix3)
# Calculate AUC for train and test data
auc_train_xgb3 <- auc(train3$Default, train_pred_xgb3)
auc_test_xgb3 <- auc(test3$Default, test_pred_xgb3)
auc_test_xgb3
# Hard class assignment at the 0.5 probability cutoff
train_pred_class_xgb3 <- ifelse(train_pred_xgb3 > 0.5, 1, 0)
test_pred_class_xgb3 <- ifelse(test_pred_xgb3 > 0.5, 1, 0)
# Test-set confusion matrix and derived metrics
confusion_matrix_test_xgb3 <- table(Predicted = test_pred_class_xgb3, Actual = test3$Default)
TP_test_xgb3 <- confusion_matrix_test_xgb3["1", "1"]
FP_test_xgb3 <- confusion_matrix_test_xgb3["1", "0"]
FN_test_xgb3 <- confusion_matrix_test_xgb3["0", "1"]
TN_test_xgb3 <- confusion_matrix_test_xgb3["0", "0"]
confusion_matrix_test_xgb3
Specificity_test_xgb3 <- TN_test_xgb3 / (TN_test_xgb3 + FP_test_xgb3)
Specificity_test_xgb3
Sensitivity_test_xgb3 <- TP_test_xgb3 / (TP_test_xgb3 + FN_test_xgb3)
Sensitivity_test_xgb3
Precision_test_xgb3 <- TP_test_xgb3 / (TP_test_xgb3 + FP_test_xgb3)
Precision_test_xgb3
# ROC curves for train and test
roc_train3 <- roc(train3$Default, train_pred_xgb3)
roc_test3 <- roc(test3$Default, test_pred_xgb3)
# Plot ROC Curve
ggroc(roc_test3) +
ggtitle("ROC Curve for Test Data") +
xlab("1 - Specificity") +
ylab("Sensitivity")
# AUC Calculation
auc_test3 <- auc(roc_test3)
auc_test3
hist(test_pred_xgb3)
# ---- Segment 4: XGBoost within the fourth k-means cluster ----
segment4<- df2[df2$groupkm==4,]
segment4 <- segment4[,1:33]
set.seed(4444)
index4 <- sample(2,size = nrow(segment4), replace = T , prob=c(0.7,0.3))
train4 <- segment4[index4 == 1, ]
test4 <- segment4[index4 == 2, ]
# Convert the label from factor to numeric
train4$Default <- as.numeric(as.character(train4$Default))
test4$Default <- as.numeric(as.character(test4$Default))
trainxg_matrix4 <- xgb.DMatrix(data = as.matrix(train4[, -33]), label = train4$Default)
testxg_matrix4 <- xgb.DMatrix(data = as.matrix(test4[, -33]), label = test4$Default)
# Set parameters for XGBoost
params <- list(objective = "binary:logistic", eval_metric = "auc")
# Train the model
xgb_model4 <- xgboost(params = params, data = trainxg_matrix4, nrounds = 100, verbose = 0)
# Make predictions
train_pred_xgb4 <- predict(xgb_model4, trainxg_matrix4)
test_pred_xgb4 <- predict(xgb_model4, testxg_matrix4)
# Calculate AUC for train and test data
auc_train_xgb4 <- auc(train4$Default, train_pred_xgb4)
auc_test_xgb4 <- auc(test4$Default, test_pred_xgb4)
#> Setting direction: controls < cases
auc_test_xgb4
# BUG FIX: test_pred_class_xgb4 was used below without ever being defined —
# the 0.5-cutoff class assignment present in segments 1-3 was missing here.
train_pred_class_xgb4 <- ifelse(train_pred_xgb4 > 0.5, 1, 0)
test_pred_class_xgb4 <- ifelse(test_pred_xgb4 > 0.5, 1, 0)
# Test-set confusion matrix and derived metrics
confusion_matrix_test_xgb4 <- table(Predicted = test_pred_class_xgb4, Actual = test4$Default)
TP_test_xgb4 <- confusion_matrix_test_xgb4["1", "1"]
FP_test_xgb4 <- confusion_matrix_test_xgb4["1", "0"]
FN_test_xgb4 <- confusion_matrix_test_xgb4["0", "1"]
TN_test_xgb4 <- confusion_matrix_test_xgb4["0", "0"]
confusion_matrix_test_xgb4
Specificity_test_xgb4 <- TN_test_xgb4 / (TN_test_xgb4 + FP_test_xgb4)
Specificity_test_xgb4
Sensitivity_test_xgb4 <- TP_test_xgb4 / (TP_test_xgb4 + FN_test_xgb4)
Sensitivity_test_xgb4
Precision_test_xgb4 <- TP_test_xgb4 / (TP_test_xgb4 + FP_test_xgb4)
Precision_test_xgb4
# ROC curves for train and test
roc_train4 <- roc(train4$Default, train_pred_xgb4)
# BUG FIX: roc_test4 was plotted and used for the AUC below without being
# defined anywhere.
roc_test4 <- roc(test4$Default, test_pred_xgb4)
# Plot ROC Curve
ggroc(roc_test4) +
ggtitle("ROC Curve for Test Data") +
xlab("1 - Specificity") +
ylab("Sensitivity")
# AUC Calculation
auc_test4 <- auc(roc_test4)
auc_test4
hist(test_pred_xgb4)
# Segment sizes (dplyr::count() returns a 1x1 frame; as.numeric unwraps it
# to nrow of each segment).
count_seg1 <- as.numeric(count(segment1))
count_seg2 <- as.numeric(count(segment2))
count_seg3 <- as.numeric(count(segment3))
count_seg4 <- as.numeric(count(segment4))
# Create a data frame with the accuracy metrics for each group
results_xgb <- data.frame(
Group = c("Group 1", "Group 2", "Group 3", "Group 4"),
AUC_Test = c(auc_test_xgb1, auc_test_xgb2, auc_test_xgb3, auc_test_xgb4),
Specificity = c(Specificity_test_xgb1, Specificity_test_xgb2, Specificity_test_xgb3, Specificity_test_xgb4),
count = c(count_seg1,count_seg2,count_seg3,count_seg4)
)
# Print the results table
print(results_xgb)
# ---- Logistic regression on the full modelling frame ----
set.seed(9)
dfn <- df2[1:33]
# BUG FIX: index0 was created here but the stale `index` from the xgboost
# section was used for the split below; use index0 consistently.
index0 <- sample(2,size = nrow(dfn), replace = T , prob=c(0.7,0.3))
df_train <- dfn[index0 == 1, ]
df_test <- dfn[index0 == 2, ]
logit1 <- glm(formula = Default ~ ., family = stats::binomial("logit"), data = df_train)
summary(logit1)
# Predictions on training and test data
train_pred1 <- predict(logit1, df_train, type = 'response')
test_pred1 <- predict(logit1, df_test, type = 'response')
df_test$test_pred1 <- test_pred1
df_train$train_pred1 <- train_pred1
# Optimal (accuracy-maximising) cutoff determined on the test set
pred1 <- prediction(test_pred1, df_test$Default)
perform1 <- performance(pred1, "acc")
max1 <- which.max(slot(perform1, "y.values")[[1]])
prob1 <- slot(perform1, "x.values")[[1]][max1]
prob1
# AUC via ROCR
auc_bin1 <- performance(pred1, "auc")
auc_bin1 <- unlist(slot(auc_bin1, "y.values"))
auc_bin1
# Confusion Matrix - Test Data (at the optimal cutoff)
test_pred_class <- ifelse(test_pred1 > prob1, 1, 0)
tble_cf <- table(Predicted = test_pred_class, Actual = df_test$Default)
tble_cf
# BUG FIX: TP/FP/FN/TN/Specificity were used below without ever being
# extracted from the confusion matrix.
TP <- tble_cf["1", "1"]
FP <- tble_cf["1", "0"]
FN <- tble_cf["0", "1"]
TN <- tble_cf["0", "0"]
Specificity <- TN / (TN + FP)
Specificity
Sensitivity <- TP / (TP + FN)
Sensitivity
Precision <- TP / (TP + FP)
Precision
classification_metrics_test <- data.frame(Specificity, Sensitivity, Precision)
classification_metrics_test
# ROC and AUC - Test Data
roc_test <- roc(df_test$Default, test_pred1)
# Plot ROC Curve
ggroc(roc_test) +
ggtitle("ROC Curve for Test Data") +
xlab("1 - Specificity") +
ylab("Sensitivity")

# AUC Calculation
auc_test <- auc(roc_test)
auc_test
#### Group2: logistic regression within k-means segment 2
logit_Seg2 <- glm(formula = Default ~ ., family = stats::binomial("logit"), data = train2[1:33])
summary(logit_Seg2)
# Predictions on training and test data
trainpred_seg2 <- predict(logit_Seg2, train2, type = 'response')
testpred_seg2 <- predict(logit_Seg2, test2, type = 'response')
test2$testpred_seg2 <- testpred_seg2
train2$trainpred_seg2 <- trainpred_seg2
# Optimal (accuracy-maximising) cutoff determination
pred2_seg <- prediction(testpred_seg2, test2$Default)
performseg2 <- performance(pred2_seg, "acc")
max2 <- which.max(slot(performseg2, "y.values")[[1]])
# BUG FIX: the cutoff was indexed with max1 (from the all-data model)
# instead of this segment's own max2.
probseg2 <- slot(performseg2, "x.values")[[1]][max2]
probseg2
# Confusion Matrix - Test Data
# BUG FIX: classification previously used prob1 (the all-data cutoff)
# instead of the segment-specific probseg2 computed just above.
testseg_pred_class <- ifelse(testpred_seg2 > probseg2, 1, 0)
tble_testseg2 <- table(Predicted = testseg_pred_class, Actual = test2$Default)
tble_testseg2
# Classification metrics - Test Data
TPseg2 <- tble_testseg2["1", "1"]
FPseg2 <- tble_testseg2["1", "0"]
FNseg2 <- tble_testseg2["0", "1"]
TNseg2 <- tble_testseg2["0", "0"]
Specificityseg2 <- TNseg2 / (TNseg2 + FPseg2)
Sensitivityseg2 <- TPseg2 / (TPseg2 + FNseg2)
Precisionseg2 <- TPseg2 / (TPseg2 + FPseg2)
classification_metrics_testseg2 <- data.frame(Specificityseg2, Sensitivityseg2, Precisionseg2)
classification_metrics_testseg2
# ROC and AUC - Test Data
roc_testseg2 <- roc(test2$Default, testpred_seg2)
# Plot ROC Curve
ggroc(roc_testseg2) +
ggtitle("ROC Curve for Testseg Data") +
xlab("1 - Specificityseg") +
ylab("Sensitivityseg")
# AUC Calculation
auc_testseg2 <- auc(roc_testseg2)
auc_testseg2
hist(testpred_seg2)
#### Group3: logistic regression within k-means segment 3
logit_Seg3 <- glm(formula = Default ~ ., family = stats::binomial("logit"), data = train3[1:33])
summary(logit_Seg3)
# Predictions on training and test data
trainpred_seg3 <- predict(logit_Seg3, train3, type = 'response')
testpred_seg3 <- predict(logit_Seg3, test3, type = 'response')
test3$testpred_seg3 <- testpred_seg3
train3$trainpred_seg3 <- trainpred_seg3
# Optimal (accuracy-maximising) cutoff determination
pred3_seg <- prediction(testpred_seg3, test3$Default)
performseg3 <- performance(pred3_seg, "acc")
max3 <- which.max(slot(performseg3, "y.values")[[1]])
# BUG FIX: the cutoff was indexed with max1 (from the all-data model)
# instead of this segment's own max3.
probseg3 <- slot(performseg3, "x.values")[[1]][max3]
probseg3
# Confusion Matrix - Test Data
# BUG FIX: classification previously used prob1 (the all-data cutoff)
# instead of the segment-specific probseg3 computed just above.
testseg_pred_class <- ifelse(testpred_seg3 > probseg3, 1, 0)
tble_testseg3 <- table(Predicted = testseg_pred_class, Actual = test3$Default)
tble_testseg3
# Classification metrics - Test Data
TPseg3 <- tble_testseg3["1", "1"]
FPseg3 <- tble_testseg3["1", "0"]
FNseg3 <- tble_testseg3["0", "1"]
TNseg3 <- tble_testseg3["0", "0"]
Specificityseg3 <- TNseg3 / (TNseg3 + FPseg3)
Sensitivityseg3 <- TPseg3 / (TPseg3 + FNseg3)
Precisionseg3 <- TPseg3 / (TPseg3 + FPseg3)
classification_metrics_testseg3 <- data.frame(Specificityseg3, Sensitivityseg3, Precisionseg3)
classification_metrics_testseg3
# ROC and AUC - Test Data
roc_testseg3 <- roc(test3$Default, testpred_seg3)
# Plot ROC Curve
ggroc(roc_testseg3) +
ggtitle("ROC Curve for Testseg Data") +
xlab("1 - Specificityseg") +
ylab("Sensitivityseg")
# AUC Calculation
auc_testseg3 <- auc(roc_testseg3)
auc_testseg3
hist(testpred_seg3)
#### Group4: logistic regression within k-means segment 4
logit_Seg4 <- glm(formula = Default ~ ., family = stats::binomial("logit"), data = train4[1:33])
summary(logit_Seg4)
# Predictions on training and test data
trainpred_seg4 <- predict(logit_Seg4, train4, type = 'response')
testpred_seg4 <- predict(logit_Seg4, test4, type = 'response')
test4$testpred_seg4 <- testpred_seg4
train4$trainpred_seg4 <- trainpred_seg4
# Optimal (accuracy-maximising) cutoff determination
pred4_seg <- prediction(testpred_seg4, test4$Default)
performseg4 <- performance(pred4_seg, "acc")
max4 <- which.max(slot(performseg4, "y.values")[[1]])
# BUG FIX: the cutoff was indexed with max1 (from the all-data model)
# instead of this segment's own max4.
probseg4 <- slot(performseg4, "x.values")[[1]][max4]
probseg4
# Confusion Matrix - Test Data
# BUG FIX: classification previously used prob1 (the all-data cutoff)
# instead of the segment-specific probseg4 computed just above.
testseg_pred_class <- ifelse(testpred_seg4 > probseg4, 1, 0)
tble_testseg4 <- table(Predicted = testseg_pred_class, Actual = test4$Default)
tble_testseg4
# Classification metrics - Test Data
TPseg4 <- tble_testseg4["1", "1"]
FPseg4 <- tble_testseg4["1", "0"]
FNseg4 <- tble_testseg4["0", "1"]
TNseg4 <- tble_testseg4["0", "0"]
Specificityseg4 <- TNseg4 / (TNseg4 + FPseg4)
Sensitivityseg4 <- TPseg4 / (TPseg4 + FNseg4)
Precisionseg4 <- TPseg4 / (TPseg4 + FPseg4)
classification_metrics_testseg4 <- data.frame(Specificityseg4, Sensitivityseg4, Precisionseg4)
classification_metrics_testseg4
# ROC and AUC - Test Data
roc_testseg4 <- roc(test4$Default, testpred_seg4)
# Plot ROC Curve
ggroc(roc_testseg4) +
ggtitle("ROC Curve for Testseg Data") +
xlab("1 - Specificityseg") +
ylab("Sensitivityseg")
# AUC Calculation
auc_testseg4 <- auc(roc_testseg4)
auc_testseg4
hist(testpred_seg4)
# Segment sizes (recomputed; identical to the count_seg* values above).
count_seg1 <- as.numeric(count(segment1))
count_seg2 <- as.numeric(count(segment2))
count_seg3 <- as.numeric(count(segment3))
count_seg4 <- as.numeric(count(segment4))
# Create a data frame with the accuracy metrics for each group
# (segment 1 is omitted here because no logit model was fitted for it).
results_logitmodel <- data.frame(
Group = c("Group 2", "Group 3", "Group 4"),
AUC_Test = c(auc_testseg2, auc_testseg3, auc_testseg4),
Specificity = c(Specificityseg2, Specificityseg3, Specificityseg4),
count = c(count_seg2,count_seg3,count_seg4)
)
# Print the results table
print(results_logitmodel)
# Combine the results from both models into one data frame.
# Note: the 3-element Count vector is recycled over the 6 rows — correct
# here, since both models share the same per-segment counts.
combined_results_models <- data.frame(
Model = rep(c("logit", "XGBoost"), each = 3),
Group = rep(c("Group 2", "Group 3", "Group 4"), times = 2),
AUC_Test = c(auc_testseg2, auc_testseg3, auc_testseg4,
auc_test_xgb2, auc_test_xgb3, auc_test_xgb4),
Specificity = c(Specificityseg2, Specificityseg3, Specificityseg4,
Specificity_test_xgb2, Specificity_test_xgb3, Specificity_test_xgb4),
Count = c(count_seg2, count_seg3, count_seg4)
)
# Print the combined results table
print(combined_results_models)
# Rank the k-means clusters by a simple composite risk score: the lower the
# mean delinquency amount, past-due contract count, and days-since-due, the
# higher (better) the score.
# BUG FIX: `groups = 'drop'` created a literal character column named
# `groups`; the summarise() grouping option is `.groups`.
ranking_cluster <- df2 %>%
  group_by(groupkm) %>%
  summarise(
    A_Avg_TotalAmount_NegStat_L36M_Main = mean(A_Avg_TotalAmount_NegStat_L36M_Main),
    NContrWithAPastDueL36M = mean(NContrWithAPastDueL36M),
    DSinceADueActH1 = mean(DSinceADueActH1),
    .groups = 'drop'
  ) %>%
  mutate(
    composite_score = - A_Avg_TotalAmount_NegStat_L36M_Main - NContrWithAPastDueL36M -
      DSinceADueActH1
  ) %>%
  arrange(desc(composite_score))
print(ranking_cluster)